# Load libraries and data -------------------------------------------------
library(tidyverse)
library(haven)
library(car)

# Experiment 1 ------------------------------------------------------------

# Experiment 1 voter file: build demographic indicators, attach 2014 turnout,
# keep the five analysed arms, add one dummy per treatment arm, and drop the
# maddress..regdate_asnum column span.
df1 <- read_dta("../original_data/ms_sometime_voter_with_all_treatments.dta") |>
  mutate(
    # Sex indicators; any other code (or NA) yields 0.
    # Original note: the Stata file has a 3rd category for missing sex.
    female = if_else(sex == "F", 1, 0, missing = 0),
    gender_miss = if_else(sex == "U", 1, 0, missing = 0),
    male = if_else(sex == "M", 1, 0, missing = 0),
    # 1 = female, 0 = male, NA for "U" (and any other code) ...
    female_imputed = case_when(
      sex == "F" ~ 1,
      sex == "M" ~ 0,
      sex == "U" ~ NA_real_
    ),
    # ... then every NA is filled with the mean of the observed values
    female_imputed = replace(
      female_imputed,
      is.na(female_imputed),
      mean(female_imputed, na.rm = TRUE)
    ),
    # Race indicators. Original note: the data is 46,015 Black, 33,544 White,
    # 150,000 Unknown/Blank, and 40 Other.
    african_am = if_else(racename == "African American", 1, 0, missing = 0),
    asian = if_else(racename == "Asian", 1, 0, missing = 0),
    minority_unknown = if_else(
      racename == "Unknown" | racename == "Other", 1, 0,
      missing = 0
    ),
    white = if_else(racename == "White", 1, 0, missing = 0),
    native_am = if_else(racename == "Native American", 1, 0, missing = 0),
    # Age: missingness flag plus a mean-imputed copy
    age_miss = as.numeric(is.na(age)),
    age_imputed = replace(age, is.na(age), mean(age, na.rm = TRUE)),
    # Turn the labelled treatment into a factor with Control as baseline
    treatment = relevel(haven::as_factor(treatment), ref = "Control")
  ) |>
  # Turnout comes from a separate voting file keyed on vanid
  left_join(
    read_dta("../original_data/vanid_to_2014voting.dta"),
    by = "vanid"
  ) |>
  # Restrict to the arms used in the analysis and discard the unused levels
  filter(
    treatment %in% c(
      "Control",
      "Villain/Happy",
      "Villain/Disappoint",
      "Respect/Happy",
      "Respect/Disappoint"
    )
  ) |>
  droplevels() |>
  mutate(
    treatment = relevel(factor(treatment), ref = "Control"),
    # One 0/1 dummy per non-control arm
    treat_gloatingvillain = if_else(
      treatment == "Villain/Happy", 1, 0,
      missing = 0
    ),
    treat_foiledvillain = if_else(
      treatment == "Villain/Disappoint", 1, 0,
      missing = 0
    ),
    treat_happyhero = if_else(
      treatment == "Respect/Happy", 1, 0,
      missing = 0
    ),
    treat_disappointedhero = if_else(
      treatment == "Respect/Disappoint", 1, 0,
      missing = 0
    ),
    # Integer id for each distinct voter-history string. Original note: the
    # history covers the 2008, 2010, 2011, and 2012 general elections, where
    # 0 = registered and didn't vote, 1 = registered and voted, and
    # 9 = was not registered.
    votehistgroup = as.integer(factor(str_voter_history))
  ) |>
  # Remove the PII columns (everything from maddress through regdate_asnum)
  select(-c(maddress:regdate_asnum))

# Experiment 2 ------------------------------------------------------------
# Experiment 2 (Florida field experiment): treatment dummies, demographic
# indicators, vote-history count, and derived age/ideology terms.
df2 <- read_delim("../original_data/gotv_2019_FL_june_specials_program_data.txt") |>
  mutate(
    # The Florida experiment has two treatment conditions
    treat_villain = if_else(condition == "Happy Villain", 1, 0, missing = 0),
    treat_reportcard = if_else(condition == "Report Card", 1, 0, missing = 0),
    # Race factor with Native American and unknown folded into "other";
    # "caucasian" is the reference level
    racenum = relevel(
      factor(
        if_else(race %in% c("nativeAmerican", "unknown"), "other", race)
      ),
      ref = "caucasian"
    ),
    white = if_else(race == "caucasian", 1, 0, missing = 0),
    asian = if_else(race == "asian", 1, 0, missing = 0),
    black = if_else(race == "black", 1, 0, missing = 0),
    hispanic = if_else(race == "hispanic", 1, 0, missing = 0),
    female = if_else(gender == "female", 1, 0, missing = 0),
    # Sum of the four general-election turnout columns
    bc_elections_voted = voted2012g + voted2014g + voted2016g + voted2018g,
    # Year of birth is the first run of four digits in birthdate
    yob = as.numeric(str_extract(birthdate, "\\d\\d\\d\\d")),
    age = 2019 - yob,
    age2 = age^2,
    married = if_else(catalistsynthetic_married == "M", 1, 0, missing = 0),
    ideology2 = catalistmodel_ideology^2,
    statehousedistrict = factor(statehousedistrict),
    hd38 = if_else(statehousedistrict == 38, 1, 0, missing = 0),
    bc_mailaddressuspstype_H = if_else(
      mailaddruspstype == "H", 1, 0,
      missing = 0
    )
  ) |>
  # birthdate is no longer needed once yob/age exist
  select(-birthdate)


# Survey A ----------------------------------------------------------------
# Survey A: one row per respondent x emotion x index, where index is "1" or
# "2" (the two rated scenarios) or "net" (the difference between them).
# Built inside local() so the helpers below do not leak into the workspace.
dfA <- local({
  # Emotion labels, in the order of the feel1_* / feel2_* survey columns
  emotions <- c(
    "angry", "happy", "indiff", "guilty",
    "smug", "disapp", "proud", "shame"
  )

  # Append one net_feel<emotion> column per emotion:
  #   rating 1 - rating 2  when zAction_1 is "voted",
  #   rating 2 - rating 1  when zAction_1 is "did not vote",
  #   NA otherwise.
  # Uses plain column lookups rather than across() + cur_data() (deprecated
  # in dplyr) with a "1" -> "2" substitution on the column name.
  add_net_scores <- function(data) {
    first_action <- data[["zAction_1"]]
    for (emotion in emotions) {
      rating_1 <- data[[paste0("feel", emotion, "_1")]]
      rating_2 <- data[[paste0("feel", emotion, "_2")]]
      data[[paste0("net_feel", emotion)]] <- case_when(
        first_action == "voted" ~ rating_1 - rating_2,
        first_action == "did not vote" ~ rating_2 - rating_1
      )
    }
    data
  }

  read_csv("../original_data/Villain%2FHero+2+%2B+Policy+Ranking+Pilot+-+MTurk_June+6%2C+2017_14.10.csv") |>
    # Skip the first two rows (presumably the extra Qualtrics header rows --
    # TODO confirm against the raw export)
    slice(3:n()) |>
    #left_join(read_csv("data/VillainConstrual_MTurkStudy2_ApprovedAssignments.csv") %>%
    #            select("ResponseId" = surveycode, inQualtrics)) %>%
    #filter(!is.na(inQualtrics)) %>%
    # Keep the needed columns, give the paired items a "_1" / "_2" suffix so
    # they can be pivoted, and drop the display-order ("_DO_") columns
    select(
      StartDate:feel2_DO_8, party:party_ind, zWhoDraw:zAction2,
      "who_1" = who1, "why_1" = why1, who_2 = who2, "why_2" = why2,
      "zWhoQ_1" = zWho1Q, "zWhoQ_2" = zWho2Q, zWho_1 = zWho1, zWho_2 = zWho2,
      zAction_1 = zAction1, zAction_2 = zAction2,
      -contains("_DO_")
    ) |>
    # Name the feel items after their emotions and make them numeric
    rename_with(~ paste0("feel", emotions, "_1"), starts_with("feel1_")) |>
    rename_with(~ paste0("feel", emotions, "_2"), starts_with("feel2_")) |>
    mutate(across(starts_with("feel"), as.numeric)) |>
    # Net variables must be added before the treatment columns so that the
    # positional ranges in the final select() keep their meaning
    add_net_scores() |>
    # Each respondent falls into exactly one treatment cell
    mutate(
      treatment = case_when(
        zRespect == "can't stand" & zVote == "vote" ~ "Foiled_Villain",
        zRespect == "can't stand" & zVote == "do not vote" ~ "Gloating_Villain",
        zRespect == "respect" & zVote == "vote" ~ "Happy_Hero",
        zRespect == "respect" & zVote == "do not vote" ~ "Disappointed_Hero"
      ),
      treatment = relevel(factor(treatment), ref = "Happy_Hero"),
      villain = case_when(
        zRespect == "can't stand" ~ "Villain",
        zRespect == "respect" ~ "Hero"
      )
    ) |>
    # Split the _1 / _2 pairs into two rows per respondent
    pivot_longer(
      cols = c(who_1:feelshame_2, zWho_1:zWhoQ_2, zAction_1:zAction_2),
      names_to = c(".value", "index"),
      names_sep = "_"
    ) |>
    mutate(
      # Which person the respondent was asked to think about
      think_person = case_when(
        zWho == "respect" ~ "hero",
        zWho == "can't stand" ~ "villain"
      ),
      # Voted versus did not vote
      vote = case_when(
        zAction == "voted" ~ 1,
        zAction == "did not vote" ~ 0
      )
    ) |>
    # One row per emotion observation
    pivot_longer(
      cols = c(starts_with("net_"), starts_with("feel")),
      names_to = "emotion",
      values_to = "value"
    ) |>
    # Net values are repeated on both index rows; keep the copy on index 1
    # (index is a character column after the pivot)
    filter(!(index == "2" & str_detect(emotion, "net_"))) |>
    mutate(
      # ... and relabel that copy's index as "net"
      index = case_when(
        str_detect(emotion, "net_") ~ "net",
        TRUE ~ index
      ),
      # Reduce the emotion column to the bare emotion name
      emotion = str_remove(emotion, "net_"),
      emotion = str_remove(emotion, "feel")
    ) |>
    select(
      ResponseId, RecordedDate, birthyear_1:party_ind,
      treatment:why, think_person:value
    ) |>
    mutate(RecordedDate = as.Date(RecordedDate))
})
  
# Survey B ----------------------------------------------------------------

# Survey B: one row per respondent x emotion x index, where index is "1" or
# "2" (the two rated scenarios) or "net" (the difference between them).
# Built inside local() so the helpers below do not leak into the workspace.
dfB <- local({
  # Emotion labels, in the order of the feel1_* / feel2_* survey columns
  emotions <- c("angry", "happy", "guilty", "smug", "proud", "shame")

  # Append one net_feel<emotion> column per emotion:
  #   rating 1 - rating 2  when zAction_1 is "voted",
  #   rating 2 - rating 1  when zAction_1 is "did not vote",
  #   NA otherwise.
  # Uses plain column lookups rather than across() + cur_data() (deprecated
  # in dplyr) with a "1" -> "2" substitution on the column name.
  add_net_scores <- function(data) {
    first_action <- data[["zAction_1"]]
    for (emotion in emotions) {
      rating_1 <- data[[paste0("feel", emotion, "_1")]]
      rating_2 <- data[[paste0("feel", emotion, "_2")]]
      data[[paste0("net_feel", emotion)]] <- case_when(
        first_action == "voted" ~ rating_1 - rating_2,
        first_action == "did not vote" ~ rating_2 - rating_1
      )
    }
    data
  }

  read_csv("../original_data/Govt_Spending__SSI__FINAL.csv") |>
    # Skip the first row (presumably an extra header row in the export --
    # TODO confirm against the raw file)
    slice(2:n()) |>
    # Keep the needed columns and give the paired items a "_1" / "_2" suffix
    # so they can be pivoted
    select(
      V1:isRandomized, zWhoDraw:zAction2, vh_js:ps8,
      ResponseId = V1, StartDate = V8,
      "who_1" = who1, "why_1" = why1, who_2 = who2, "why_2" = why2,
      "zWhoQ_1" = zWho1Q, "zWhoQ_2" = zWho2Q, zWho_1 = zWho1, zWho_2 = zWho2,
      zAction_1 = zAction1, zAction_2 = zAction2
    ) |>
    filter(vh_js == 1) |>
    # Name the feel items after their emotions and make them numeric
    rename_with(~ paste0("feel", emotions, "_1"), starts_with("feel1_")) |>
    rename_with(~ paste0("feel", emotions, "_2"), starts_with("feel2_")) |>
    mutate(across(starts_with("feel"), as.numeric)) |>
    # Net variables are appended before the pivot so that the positional
    # ranges in the final select() keep their meaning
    add_net_scores() |>
    # Split the _1 / _2 pairs into two rows per respondent
    pivot_longer(
      cols = c(
        zWho_1:zWhoQ_2, zAction_1:zAction_2, who_1:why_2,
        feelangry_1:feelshame_1, feelangry_2:feelshame_2
      ),
      names_to = c(".value", "index"),
      names_sep = "_"
    ) |>
    mutate(
      # Which person the respondent was asked to think about
      think_person = case_when(
        zWho == "respect" ~ "hero",
        zWho == "can't stand" ~ "villain"
      ),
      # Each respondent falls into exactly one treatment cell
      treatment = case_when(
        zRespect == "can't stand" & zVote == "vote" ~ "Foiled_Villain",
        zRespect == "can't stand" & zVote == "do not vote" ~ "Gloating_Villain",
        zRespect == "respect" & zVote == "vote" ~ "Happy_Hero",
        zRespect == "respect" & zVote == "do not vote" ~ "Disappointed_Hero"
      ),
      treatment = relevel(factor(treatment), ref = "Happy_Hero"),
      villain = case_when(
        zRespect == "can't stand" ~ "Villain",
        zRespect == "respect" ~ "Hero"
      ),
      # Voted versus did not vote
      vote = case_when(
        zAction == "voted" ~ 1,
        zAction == "did not vote" ~ 0
      )
    ) |>
    # One row per emotion observation
    pivot_longer(
      cols = c(starts_with("net_"), starts_with("feel")),
      names_to = "emotion",
      values_to = "value"
    ) |>
    # Net values are repeated on both index rows; keep the copy on index 1
    # (index is a character column after the pivot)
    filter(!(index == "2" & str_detect(emotion, "net_"))) |>
    mutate(
      # ... and relabel that copy's index as "net"
      index = case_when(
        str_detect(emotion, "net_") ~ "net",
        TRUE ~ index
      ),
      # Reduce the emotion column to the bare emotion name
      emotion = str_remove(emotion, "net_"),
      emotion = str_remove(emotion, "feel")
    ) |>
    dplyr::select(ResponseId, StartDate, ps0:index, who:value) |>
    mutate(StartDate = as.Date(StartDate))
})




# Survey C ----------------------------------------------------------------

# Survey C vendor demographics: adults only, recoded into numeric d_* columns
# keyed by respondent_id.
dfC_respondents <- read_csv("../original_data/DemographicsFromBovitz.csv") |>
  filter(age >= 18) |>
  mutate(
    d_age = age,
    # Education collapsed to four levels:
    # 1 = HS or less, 2 = some college / trade / associates, 3 = BA,
    # 4 = post-BA
    d_educ = case_when(
      education == 1 | education == 2 ~ 1,
      education >= 3 & education <= 5 ~ 2,
      education == 6 ~ 3,
      education > 6 ~ 4
    ),
    # Ethnicity flags. White counts only when it is the sole identity
    # (exact match on 1); the others match the code anywhere in the field.
    d_iswhite = if_else(ethnicity == 1, 1, 0, missing = 0),
    d_isblack = if_else(str_detect(ethnicity, "2"), 1, 0, missing = 0),
    d_isasian = if_else(str_detect(ethnicity, "3"), 1, 0, missing = 0),
    d_isother = if_else(
      str_detect(ethnicity, "4|5|98|99"), 1, 0,
      missing = 0
    ),
    d_hispanic = if_else(hispanic == 1, 1, 0, missing = 0),
    # Gender flags: d_female is 1 for gender code 2, d_nonbinary is 1 for
    # gender code 3; each is 0 otherwise
    d_female = if_else(gender == 2, 1, 0, missing = 0),
    d_nonbinary = if_else(gender == 3, 1, 0, missing = 0),
    # Income (1-11): refusals (99) are placed at 6 and flagged separately
    d_income = case_when(
      income == 99 ~ 6,
      TRUE ~ income
    ),
    d_incomerf = if_else(income == 99, 1, 0, missing = 0),
    # Ideology (1 = extremely liberal, 7 = extremely conservative)
    d_ideology = political_ideology,
    # Partisanship (1 = strong Dem., 7 = strong Rep.); codes 98 and 99 are
    # placed at the midpoint
    d_pid7 = case_when(
      political_party_preference == 98 |
        political_party_preference == 99 ~ 4,
      TRUE ~ political_party_preference
    )
  ) |>
  # Keep the id plus the derived columns, all stored as numeric
  select(respondent_id, matches("^d_")) |>
  mutate(across(starts_with("d_"), as.numeric))


# Survey C responses: screen out invalid respondents, attach the vendor
# demographics, and reshape the emotion ratings to one row per
# respondent x emotion x rating type.
dfC <- read_csv("../original_data/Gloating+Villain+BJPS+Replication_October+12,+2024_11.03.csv") |>
  filter(
    # Drop test responses and incompletes
    Status == "IP Address",
    DistributionChannel == "anonymous",
    # Consenting respondents only
    Consent == "I agree to participate",
    # Must have come through the survey vendor ...
    !is.na(RESPONDENT_ID),
    # ... and must not be one of the vendor's test ids
    !RESPONDENT_ID %in% c(
      "ricktest444",
      "iP60590b8cea02cfa0",
      "testtesttest333",
      "iP9dbd979bb844b32d",
      "iP28dc6c86156312a7",
      "iPa1551232f5324c17"
    )
  ) |>
  # First response per respondent id (even if unfinished), then first
  # response per IP address
  group_by(RESPONDENT_ID) |>
  slice(1) |>
  ungroup() |>
  group_by(IPAddress) |>
  slice(1) |>
  ungroup() |>
  # Keep only coordinates inside a rough US bounding box. Original note: this
  # also filters out unfinished respondents, since their lat/long is NA.
  mutate(across(c(LocationLatitude, LocationLongitude), as.numeric)) |>
  filter(
    LocationLatitude > 19,
    LocationLatitude < 65,
    LocationLongitude > -165,
    LocationLongitude < -60
  ) |>
  # Attach the vendor demographics
  left_join(dfC_respondents, by = c("RESPONDENT_ID" = "respondent_id")) |>
  mutate(
    # Attention checks: 1 when the expected answer was chosen, else 0
    got_easy_acq = if_else(
      news_howidentified == "Because he left his ID", 1, 0,
      missing = 0
    ),
    got_hard_acq = if_else(
      news_howmuchmoney == "About $1,500", 1, 0,
      missing = 0
    ),
    # Treatment factor with Control as the reference level
    Treatment = relevel(factor(FL_24_DO), ref = "Control")
  ) |>
  # Keep the analysis columns; display-order ("_DO") columns are dropped
  select(
    ResponseId, RESPONDENT_ID, StartDate, EndDate, Progress,
    `Duration (in seconds)`:RecordedDate,
    news_toomuchcrime, Control_BFWant, AR_response, PAR_response,
    Action1Anticipated_1:EmotionNow_DO, feedback, ZAction1, Zaction2,
    Treatment, d_age:got_hard_acq, -contains("_DO")
  ) |>
  # Name the dependent variables after their emotions
  rename_with(
    \(old) paste0("feel_", c("angry", "happy", "guilty", "smug", "proud", "shame"), "_1"),
    starts_with("Action1Anticipated_")
  ) |>
  rename_with(
    \(old) paste0("feel_", c("angry", "happy", "guilty", "smug", "proud", "shame"), "_2"),
    starts_with("Action2Anticipated_")
  ) |>
  rename_with(
    \(old) paste0("now_", c("angry", "happy", "guilty", "smug", "proud", "shame")),
    starts_with("EmotionNow_")
  ) |>
  # Long format: feel / now become value columns, with emotion and index
  # keys. The now_* names have no index piece, so index is NA on those rows.
  pivot_longer(
    cols = feel_angry_1:now_shame,
    names_to = c(".value", "emotion", "index"),
    names_sep = "_"
  ) |>
  mutate(
    # Rating type: the action attached to index 1 or 2, or "current" for the
    # now_* ratings
    type = case_when(
      index == 1 ~ ZAction1,
      index == 2 ~ Zaction2,
      is.na(index) ~ "current"
    ),
    type = case_when(
      type == "don't vote" ~ "novote",
      TRUE ~ type
    ),
    # Single value column: anticipated rating for vote/novote rows, current
    # rating for "current" rows; -99 becomes NA
    value = case_when(
      type %in% c("novote", "vote") ~ feel,
      type == "current" ~ now
    ),
    value = case_when(
      value == "-99" ~ NA_real_,
      TRUE ~ as.numeric(value)
    )
  ) |>
  # Drop the helper columns index..now.
  # NOTE(review): because the first selection is negative, listing ZAction1
  # and Zaction2 afterwards keeps them in the output rather than removing
  # them -- confirm whether they were meant to be dropped as well.
  select(-c(index:now), ZAction1, Zaction2)

# Write every cleaned data frame to its CSV in the working directory; each
# list name is the output file for the data frame it labels.
list(
  "experiment1_cleaned.csv" = df1,
  "experiment2_cleaned.csv" = df2,
  "experimentA_cleaned.csv" = dfA,
  "experimentB_cleaned.csv" = dfB,
  "experimentC_cleaned.csv" = dfC,
  "experimentCresp_cleaned.csv" = dfC_respondents
) |>
  iwalk(\(data, path) write_csv(data, path))

#save.image("cleaned_data.RData")
